package au.com.acpfg.misc.picr;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.datatype.DatatypeConstants;
import javax.xml.datatype.XMLGregorianCalendar;

import org.knime.core.data.DataCell;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataColumnSpecCreator;
import org.knime.core.data.DataRow;
import org.knime.core.data.DataTableSpec;
import org.knime.core.data.DataType;
import org.knime.core.data.RowIterator;
import org.knime.core.data.date.DateAndTimeCell;
import org.knime.core.data.def.DefaultRow;
import org.knime.core.data.def.StringCell;
import org.knime.core.node.BufferedDataContainer;
import org.knime.core.node.BufferedDataTable;
import org.knime.core.node.CanceledExecutionException;
import org.knime.core.node.ExecutionContext;
import org.knime.core.node.ExecutionMonitor;
import org.knime.core.node.InvalidSettingsException;
import org.knime.core.node.NodeLogger;
import org.knime.core.node.NodeModel;
import org.knime.core.node.NodeSettingsRO;
import org.knime.core.node.NodeSettingsWO;
import org.knime.core.node.defaultnodesettings.SettingsModelBoolean;
import org.knime.core.node.defaultnodesettings.SettingsModelString;
import org.knime.core.node.defaultnodesettings.SettingsModelStringArray;

import uk.ac.ebi.picr.AccessionMapperInterface;
import uk.ac.ebi.picr.AccessionMapperService;
import uk.ac.ebi.picr.CrossReference;
import uk.ac.ebi.picr.UPEntry;

/**
 * This is the model implementation of PICRAccessor.
 * Provides access to the Protein Identifier Cross Reference (PICR) web service at EBI.
 *
 * Some (IMHO) broken code inside Java Web Services throws a NullPointerException to denote
 * something catchable/correctable, so be sure your exception handling in Eclipse (or whatever
 * IDE you use) does not break on caught NPEs. Sigh... what's wrong with a proper exception type?
 *
 * @author Andrew Cassin
 */
public class PICRAccessorNodeModel extends NodeModel {

    // the logger instance
    private static final NodeLogger logger = NodeLogger
            .getLogger(PICRAccessorNodeModel.class);

    static final String CFGKEY_TAXON       = "taxa";
    static final String CFGKEY_DB          = "databases";
    static final String CFGKEY_ACTIVE_ONLY = "active-only";
    static final String CFGKEY_ACCSNS      = "accessions";

    /** Number of columns in the output table, see {@link #make_output_spec()}. */
    private static final int N_OUTPUT_COLUMNS = 12;

    /** Maximum number of distinct accessions queried between two politeness pauses. */
    private static final int MAX_BATCH_SIZE = 25;

    /** Pause between two batches, to be nice to the EBI servers. */
    private static final long BATCH_PAUSE_MS = 5 * 1000L;

    /** Number of times a single accession lookup is attempted before giving up. */
    private static final int MAX_ATTEMPTS = 4;

    /** Extracts the leading numeric NCBI taxonomy ID from a setting such as "9606 Homo Sapiens". */
    private static final Pattern TAXON_ID_PATTERN = Pattern.compile("^\\s*(\\d+)\\b");

    private final SettingsModelString      m_accsns      = new SettingsModelString(CFGKEY_ACCSNS, "Accession");
    private final SettingsModelString      m_taxon       = new SettingsModelString(CFGKEY_TAXON, "9606 Homo Sapiens");
    private final SettingsModelBoolean     m_active_only = new SettingsModelBoolean(CFGKEY_ACTIVE_ONLY, true);
    private final SettingsModelStringArray m_db          = new SettingsModelStringArray(CFGKEY_DB, new String[]{"SWISSPROT"});

    /** Index of the accession column in the input table; set at the start of execute(). */
    private int m_accsn_idx;

    /** Cache of the database names reported by PICR, shared by all instances. */
    private static List<String> m_databases = null;

    private AccessionMapperService   m_service;
    private AccessionMapperInterface m_port;

    /**
     * Constructor for the node model: one input port, one output port.
     */
    protected PICRAccessorNodeModel() {
        super(1, 1);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected DataTableSpec[] configure(final DataTableSpec[] inSpecs)
            throws InvalidSettingsException {
        return new DataTableSpec[] { make_output_spec() };
    }

    /**
     * Builds the (fixed) specification of the output table. The column order here must match
     * the cell order used in {@link #add_xref_rows}.
     */
    private DataTableSpec make_output_spec() {
        DataColumnSpec[] new_cols = new DataColumnSpec[N_OUTPUT_COLUMNS];
        new_cols[0]  = new DataColumnSpecCreator("Sequence (PICR)", StringCell.TYPE).createSpec();
        new_cols[1]  = new DataColumnSpecCreator("UPI", StringCell.TYPE).createSpec();
        new_cols[2]  = new DataColumnSpecCreator("Accession (PICR)", StringCell.TYPE).createSpec();
        new_cols[3]  = new DataColumnSpecCreator("Accession Version", StringCell.TYPE).createSpec();
        new_cols[4]  = new DataColumnSpecCreator("Database Description", StringCell.TYPE).createSpec();
        new_cols[5]  = new DataColumnSpecCreator("Database Name", StringCell.TYPE).createSpec();
        new_cols[6]  = new DataColumnSpecCreator("Date Added", DateAndTimeCell.TYPE).createSpec();
        new_cols[7]  = new DataColumnSpecCreator("Date Deleted", DateAndTimeCell.TYPE).createSpec();
        new_cols[8]  = new DataColumnSpecCreator("GI", StringCell.TYPE).createSpec();
        new_cols[9]  = new DataColumnSpecCreator("Taxon ID", StringCell.TYPE).createSpec();
        new_cols[10] = new DataColumnSpecCreator("Accession (user-supplied)", StringCell.TYPE).createSpec();
        new_cols[11] = new DataColumnSpecCreator("Cross Reference Type", StringCell.TYPE).createSpec();
        return new DataTableSpec(new_cols);
    }

    /**
     * Looks up every non-missing accession of the configured input column via PICR and emits
     * one output row per cross reference found.
     *
     * Accessions are collected into batches of at most {@link #MAX_BATCH_SIZE} distinct values;
     * a pause is inserted between batches. Every input row is queued (including the row that
     * fills a batch and the final row), and the queue is emptied after each batch so that no
     * accession is queried twice for the same batch or silently dropped.
     *
     * {@inheritDoc}
     */
    @Override
    protected BufferedDataTable[] execute(final BufferedDataTable[] inData,
            final ExecutionContext exec) throws Exception {
        final BufferedDataTable input = inData[0];
        final int n_rows = input.getRowCount();
        logger.info("Performing conversion of "+n_rows+" accessions via PICR@EBI");

        m_accsn_idx = input.getSpec().findColumnIndex(m_accsns.getStringValue());
        if (m_accsn_idx < 0) {
            throw new InvalidSettingsException("Cannot find column: "+m_accsns.getStringValue());
        }

        List<String> dbs = new ArrayList<String>();
        Collections.addAll(dbs, m_db.getStringArrayValue());
        if (dbs.isEmpty()) {
            // fetch_entries() requires at least one target database
            throw new InvalidSettingsException("No PICR database selected: please select at least one database.");
        }
        logger.info("Searching for mappings into "+dbs.size()+ " EBI databases.");

        // check required taxa
        final String taxon = resolve_taxon(m_taxon.getStringValue());
        final boolean active_only = m_active_only.getBooleanValue();

        // setup accession web service objects
        m_service = new AccessionMapperService();
        m_port    = m_service.getAccessionMapperPort();

        // create output table
        BufferedDataContainer container = exec.createDataContainer(make_output_spec(), true);

        // insertion-ordered set: removes duplicates within a batch, keeps input order for output
        Set<String> batch = new LinkedHashSet<String>();
        long rows_seen = 0;
        int hit = 1;
        boolean first_batch = true;

        RowIterator it = input.iterator();
        while (it.hasNext()) {
            DataRow r = it.next();
            rows_seen++;

            DataCell accsn_cell = r.getCell(m_accsn_idx);
            if (accsn_cell != null && !accsn_cell.isMissing()) {
                batch.add(accsn_cell.toString());
            }

            // flush when the batch is full, or when the input is exhausted (even if the
            // last row itself had a missing value)
            boolean batch_full = batch.size() >= MAX_BATCH_SIZE;
            boolean last_row   = !it.hasNext();
            if (!batch.isEmpty() && (batch_full || last_row)) {
                if (!first_batch) {
                    Thread.sleep(BATCH_PAUSE_MS);  // be nice to EBI servers
                }
                first_batch = false;
                hit = run_batch(batch, dbs, taxon, active_only, container, exec, hit);
                batch.clear();
            }

            exec.checkCanceled();
            exec.setProgress((double) rows_seen / n_rows);
        }

        container.close();
        BufferedDataTable out = container.getTable();
        return new BufferedDataTable[] {out};
    }

    /**
     * Converts the taxonomy setting into the value handed to the PICR lookup.
     *
     * @param setting the raw setting, e.g. "9606 Homo Sapiens" or "any"
     * @return the leading numeric taxonomy ID, or <code>null</code> if any species is acceptable
     * @throws InvalidSettingsException if the setting neither starts with "any" nor with a number
     */
    private String resolve_taxon(final String setting) throws InvalidSettingsException {
        if (setting == null || setting.length() < 1 || setting.toLowerCase().trim().startsWith("any")) {
            logger.info("Searching for mappings to any species.");
            return null;   // any species
        }
        Matcher m = TAXON_ID_PATTERN.matcher(setting);
        if (!m.find()) {
            throw new InvalidSettingsException("Invalid or unknown taxonomy: "+setting);
        }
        String taxon = m.group(1);
        logger.info("Search for mappings to NCBI taxonomy ID: "+taxon);
        return taxon;
    }

    /**
     * Queries PICR for every accession in the batch and appends the resulting rows.
     *
     * @param batch       accessions to look up
     * @param dbs         databases to search for entries
     * @param taxon       taxonomy constraint, <code>null</code> for any species
     * @param active_only true to report active entries only
     * @param container   output container that receives the rows
     * @param exec        used to honour cancellation between individual lookups
     * @param hit         number to use for the next row key
     * @return the number to use for the next row key after this batch
     */
    private int run_batch(final Set<String> batch, final List<String> dbs, final String taxon,
            final boolean active_only, final BufferedDataContainer container,
            final ExecutionContext exec, int hit) throws Exception {
        for (String accsn : batch) {
            exec.checkCanceled();
            for (UPEntry e : fetch_entries(accsn, dbs, taxon, active_only)) {
                hit = add_xref_rows(container, e, accsn, "identical", e.getIdenticalCrossReferences(), hit);
                hit = add_xref_rows(container, e, accsn, "logical", e.getLogicalCrossReferences(), hit);
            }
        }
        return hit;
    }

    /**
     * Appends one row per cross reference to the output container.
     *
     * @param container output container that receives the rows
     * @param entry     the PICR entry the cross references belong to
     * @param accsn     the user-supplied accession that produced the entry
     * @param xref_type value for the "Cross Reference Type" column
     * @param xrefs     the cross references to report; <code>null</code> is treated as empty
     * @param hit       number to use for the next row key
     * @return the number to use for the next row key after the rows were added
     */
    private int add_xref_rows(final BufferedDataContainer container, final UPEntry entry,
            final String accsn, final String xref_type, final List<CrossReference> xrefs, int hit) {
        if (xrefs == null) {
            return hit;
        }
        for (CrossReference xref : xrefs) {
            DataCell[] cells = new DataCell[N_OUTPUT_COLUMNS];
            cells[0]  = safe_cell(entry.getSequence());
            cells[1]  = safe_cell(entry.getUPI());
            cells[2]  = safe_cell(xref.getAccession());
            cells[3]  = safe_cell(xref.getAccessionVersion());
            cells[4]  = safe_cell(xref.getDatabaseDescription());
            cells[5]  = safe_cell(xref.getDatabaseName());
            cells[6]  = safe_cell(xref.getDateAdded());
            cells[7]  = safe_cell(xref.getDateDeleted());
            cells[8]  = safe_cell(xref.getGi());
            cells[9]  = safe_cell(xref.getTaxonId());
            cells[10] = safe_cell(accsn);
            cells[11] = new StringCell(xref_type);
            container.addRowToTable(new DefaultRow("Hit"+hit++, cells));
        }
        return hit;
    }

    /**
     * Fetches PICR data from EBI with retry in case of temporary network failure.
     *
     * @param accsn Accession to find map entries to
     * @param dbs Databases to search for entries
     * @param taxon Taxonomy constraint (only single taxon currently available). <code>Null</code> if any species is ok.
     * @param booleanValue Active entries only? (true means active only)
     * @return the entries reported by PICR, never <code>null</code>
     * @throws InterruptedException if the thread is interrupted while waiting to retry
     * @throws Exception if the service could not be reached after {@link #MAX_ATTEMPTS} attempts;
     *         the failure of the last attempt is attached as the cause
     */
    private List<UPEntry> fetch_entries(String accsn, List<String> dbs, String taxon, boolean booleanValue) throws Exception {
        assert(accsn != null && dbs != null && dbs.size() > 0);

        Exception last_failure = null;
        for (int retry=0; retry<MAX_ATTEMPTS; retry++) {
            try {
                List<UPEntry> entries = m_port.getUPIForAccession(accsn, null, dbs, taxon, booleanValue);
                if (entries == null) {
                    // defensive: treat "no result object" the same as "no hits"
                    return Collections.<UPEntry>emptyList();
                }
                return entries;
            } catch (Exception e) {
                // deliberately broad: the web service layer reports failures as NPE (see class comment)
                last_failure = e;
                logger.warn(e.getMessage());
            }

            if (retry + 1 >= MAX_ATTEMPTS) {
                break;   // no point in waiting when there is no further attempt
            }

            // NOTE(review): delays are 500, 1000, 1500 *seconds*, as in the original code -- confirm
            // that seconds (rather than milliseconds) are really intended.
            int delay = 500 * (retry+1);
            logger.warn("Temporary network failure, delaying for "+delay+" seconds");
            // an interrupt aborts the lookup instead of being swallowed
            Thread.sleep(delay * 1000L);
        }

        // persistent network problem... abort
        throw new Exception("Cannot reach EBI PICR service: aborting!", last_failure);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void reset() {
    }

    /**
     * Ensures that a valid DataCell is returned even if the value parameter is <code>null</code>
     *
     * @param value the text to wrap, may be <code>null</code>
     * @return a string cell holding the value, or a missing cell if the value is <code>null</code>
     */
    protected DataCell safe_cell(String value) {
        if (value == null) {
            return DataType.getMissingCell();
        }
        return new StringCell(value);
    }

    /**
     * Converts a calendar value into a date-only cell.
     *
     * @param cal the date to convert, may be <code>null</code>
     * @return a date cell, or a missing cell if the calendar is <code>null</code> or lacks a
     *         year, month or day
     */
    protected DataCell safe_cell(XMLGregorianCalendar cal) {
        // No calendar? return a missing value
        if (cal == null) {
            return DataType.getMissingCell();
        }
        int year  = cal.getYear();
        int month = cal.getMonth();
        int day   = cal.getDay();

        // return a missing value if the date is incomplete
        if (year == DatatypeConstants.FIELD_UNDEFINED ||
                month == DatatypeConstants.FIELD_UNDEFINED ||
                day == DatatypeConstants.FIELD_UNDEFINED) {
            return DataType.getMissingCell();
        }

        // XMLGregorianCalendar months are 1-12, whereas DateAndTimeCell expects a
        // java.util.Calendar style 0-based month
        return new DateAndTimeCell(year, month - 1, day);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void saveSettingsTo(final NodeSettingsWO settings) {
        m_accsns.saveSettingsTo(settings);
        m_db.saveSettingsTo(settings);
        m_taxon.saveSettingsTo(settings);
        m_active_only.saveSettingsTo(settings);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void loadValidatedSettingsFrom(final NodeSettingsRO settings)
            throws InvalidSettingsException {
        m_accsns.loadSettingsFrom(settings);
        m_db.loadSettingsFrom(settings);
        m_taxon.loadSettingsFrom(settings);
        m_active_only.loadSettingsFrom(settings);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void validateSettings(final NodeSettingsRO settings)
            throws InvalidSettingsException {
        m_accsns.validateSettings(settings);
        m_db.validateSettings(settings);
        m_taxon.validateSettings(settings);
        m_active_only.validateSettings(settings);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void loadInternals(final File internDir,
            final ExecutionMonitor exec) throws IOException,
            CanceledExecutionException {
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void saveInternals(final File internDir,
            final ExecutionMonitor exec) throws IOException,
            CanceledExecutionException {
    }

    /**
     * Returns the sorted names of the databases PICR can map to, contacting EBI on the first
     * call (and again on later calls for as long as the cached list is empty).
     *
     * @return the cached list of database names; this is the shared instance, not a copy
     */
    public static synchronized List<String> load_databases() {
        if (m_databases == null || m_databases.size() < 1) {
            logger.info("PICR: loading databases from EBI... please wait a few moments... ");
            AccessionMapperService service = new AccessionMapperService();
            AccessionMapperInterface port = service.getAccessionMapperPort();
            List<String> names = port.getMappedDatabaseNames();
            if (names == null) {
                // defensive: keep the cache usable; it stays empty so the next call retries
                names = new ArrayList<String>();
            }
            Collections.sort(names);
            m_databases = names;
            logger.info("PICR: done loading databases!");
        }
        return m_databases;
    }
}